/**
 *  JFTSTool - Java Full Text Search tool.
 *  Copyright (C) <2009>  <grupo JFTS>
 *  This program is free software: you can redistribute it and/or modify
 *	it under the terms of the GNU General Public License as published by
 *	the Free Software Foundation, either version 3 of the License, or
 *	(at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.jftstool.analyzers.filters;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * This filter removes accents from the terms of a token stream. <br>
 * Each accented letter is replaced by its unaccented equivalent, and
 * ligatures are expanded into their two component letters.
 * @author grupo JFTS
 *
 */
public class AccentFilter extends TokenFilter {
	/** Lowest character value that {@link #removeAccents(char[], int)} may rewrite. */
	private static final char FIRST_FOLDABLE = '\u00c0';
	/** Highest character value that {@link #removeAccents(char[], int)} may rewrite. */
	private static final char LAST_FOLDABLE = '\uFB06';

	/** Scratch buffer that receives the rewritten term; replaced by a larger one when needed. */
	private char[] output = new char[256];
	/** Number of characters of {@link #output} filled by the last call to removeAccents. */
	private int outputPos;


	/**
	 * Wraps the given stream so that its terms are rewritten by this filter.
	 *
	 * @param input the token stream to read tokens from
	 */
	public AccentFilter(TokenStream input) {
		super(input);
	}


	/**
	 * Reads the next token from the wrapped stream. When the term holds at
	 * least one character inside the candidate range, the whole term is run
	 * through {@link #removeAccents(char[], int)} and the token's term buffer
	 * is replaced with the result; otherwise the token is returned untouched.
	 *
	 * @param reusableToken token instance passed on to the wrapped stream
	 * @return the (possibly rewritten) token, or {@code null} when the wrapped
	 *         stream returned {@code null}
	 * @throws java.io.IOException propagated from the wrapped stream
	 */
	public final Token next(final Token reusableToken) throws java.io.IOException {
		assert reusableToken != null;
		final Token token = input.next(reusableToken);
		if (token == null) {
			return null;
		}

		final char[] term = token.termBuffer();
		final int termLength = token.termLength();

		// Skip ahead to the first character that could need replacing.
		int idx = 0;
		while (idx < termLength && !inFoldableRange(term[idx])) {
			idx++;
		}
		if (idx < termLength) {
			// Found one: rebuild the complete term in the scratch buffer.
			removeAccents(term, termLength);
			token.setTermBuffer(output, 0, outputPos);
		}
		return token;
	}


	/**
	 * Copies the first {@code length} characters of {@code input} into the
	 * internal scratch buffer, substituting accented characters and ligatures
	 * with their unaccented equivalents. Characters without a mapping are
	 * copied unchanged.
	 *
	 * @param input  characters to convert
	 * @param length number of characters of {@code input} to process
	 */
	public final void removeAccents(char[] input, int length) {
		// A single source character expands to two characters at most.
		final int required = 2 * length;

		// Double the capacity until the worst case fits; keep the current
		// buffer when it is already large enough.
		int capacity = output.length;
		while (capacity < required) {
			capacity *= 2;
		}
		if (capacity != output.length) {
			output = new char[capacity];
		}

		outputPos = 0;
		for (int idx = 0; idx < length; idx++) {
			final char current = input[idx];
			if (inFoldableRange(current)) {
				appendFolded(current);
			}
			else {
				// Outside the candidate range: nothing to replace.
				append(current);
			}
		}
	}


	/** Tells whether the character lies inside the range covered by the replacement table. */
	private static boolean inFoldableRange(final char c) {
		return c >= FIRST_FOLDABLE && c <= LAST_FOLDABLE;
	}


	/** Writes one character at the current end of the scratch buffer. */
	private void append(final char single) {
		output[outputPos++] = single;
	}


	/** Writes two characters at the current end of the scratch buffer. */
	private void append(final char first, final char second) {
		output[outputPos++] = first;
		output[outputPos++] = second;
	}


	/**
	 * Appends the replacement for one character from the candidate range;
	 * characters that have no entry in the table are appended as they are.
	 */
	private void appendFolded(final char c) {
		switch (c) {
			// ---- upper case, single-letter replacements ----
			case '\u00C0': case '\u00C1': case '\u00C2':
			case '\u00C3': case '\u00C4': case '\u00C5':
				append('A');
				return;
			case '\u00C7':
				append('C');
				return;
			case '\u00C8': case '\u00C9': case '\u00CA': case '\u00CB':
				append('E');
				return;
			case '\u00CC': case '\u00CD': case '\u00CE': case '\u00CF':
				append('I');
				return;
			case '\u00D0':
				append('D');
				return;
			case '\u00D1':
				append('N');
				return;
			case '\u00D2': case '\u00D3': case '\u00D4':
			case '\u00D5': case '\u00D6': case '\u00D8':
				append('O');
				return;
			case '\u00D9': case '\u00DA': case '\u00DB': case '\u00DC':
				append('U');
				return;
			case '\u00DD': case '\u0178':
				append('Y');
				return;

			// ---- lower case, single-letter replacements ----
			case '\u00E0': case '\u00E1': case '\u00E2':
			case '\u00E3': case '\u00E4': case '\u00E5':
				append('a');
				return;
			case '\u00E7':
				append('c');
				return;
			case '\u00E8': case '\u00E9': case '\u00EA': case '\u00EB':
				append('e');
				return;
			case '\u00EC': case '\u00ED': case '\u00EE': case '\u00EF':
				append('i');
				return;
			case '\u00F0':
				append('d');
				return;
			case '\u00F1':
				append('n');
				return;
			case '\u00F2': case '\u00F3': case '\u00F4':
			case '\u00F5': case '\u00F6': case '\u00F8':
				append('o');
				return;
			case '\u00F9': case '\u00FA': case '\u00FB': case '\u00FC':
				append('u');
				return;
			case '\u00FD': case '\u00FF':
				append('y');
				return;

			// ---- upper case, two-letter expansions ----
			case '\u00C6':
				append('A', 'E');
				return;
			case '\u0132':
				append('I', 'J');
				return;
			case '\u0152':
				append('O', 'E');
				return;
			case '\u00DE':
				append('T', 'H');
				return;

			// ---- lower case, two-letter expansions ----
			case '\u00E6':
				append('a', 'e');
				return;
			case '\u0133':
				append('i', 'j');
				return;
			case '\u0153':
				append('o', 'e');
				return;
			case '\u00DF':
				append('s', 's');
				return;
			case '\u00FE':
				append('t', 'h');
				return;
			case '\uFB00':
				append('f', 'f');
				return;
			case '\uFB01':
				append('f', 'i');
				return;
			case '\uFB02':
				append('f', 'l');
				return;
			case '\uFB05':
				append('f', 't');
				return;
			case '\uFB06':
				append('s', 't');
				return;

			// In range but not in the table: keep the character.
			default:
				append(c);
				return;
		}
	}
}